# Load the cleaned social-media usage data (hours per app, per student).
library(readxl)
# NOTE(review): absolute, user-specific path — consider a relative path so the
# script runs on other machines.
sm <- read_excel("/Users/user/Downloads/social_media_cleaned.xlsx")
# Inspect structure: 21 observations x 13 variables (1 ID column + 12 numeric).
str(sm)
## tibble [21 × 13] (S3: tbl_df/tbl/data.frame)
## $ character : chr [1:21] "masinl" "peace" "Patty" "Bunny" ...
## $ Instagram : num [1:21] 3.5 7.73 3.77 5.38 12 ...
## $ LinkedIn : num [1:21] 4 5.2 7 5.317 0.583 ...
## $ SnapChat : num [1:21] 1 3.683 0.533 1.3 0 ...
## $ Twitter : num [1:21] 5 0 0 0 0.667 ...
## $ Whatsapp : num [1:21] 1 4.18 9.83 5.3 3 ...
## $ youtube : num [1:21] 2.5 4.25 1.85 2 3.5 7 3 2 4 3 ...
## $ OTT : num [1:21] 14.5 0 2 2 2 3 0 3 3 0 ...
## $ Reddit : num [1:21] 2.5 0 0 0 1 0 0 0 0 0 ...
## $ Trouble_falling_asleep: num [1:21] 0 1 0 0 1 0 0 1 0 0 ...
## $ productivity : num [1:21] 1 1 1 1 1 1 1 1 1 0 ...
## $ Tired_morning : num [1:21] 0 0 0 0 1 0 1 1 0 0 ...
## $ weekenergy : num [1:21] 3 3 4 4 3 5 4 3 3 2 ...
#The data was generated based on the usage of social_media apps (in hours) of our MVA class students and average of three weeks data was consolidated and taken for the analysis.
#Dependent Variables:Trouble_falling_asleep,productivity,Tired_morning,weekenergy
#Independent Variables:Instagram,LinkedIn,SnapChat,Twitter,Whatsapp,youtube,OTT,Reddit
#About the dataset * Character ID: Unique ID of each student’s data entry. * Instagram: Instagram app usage duration measured in hours. * LinkedIn:LinkenIN app usage duration measured in hours. * Snapchat:Snapchat app usage duration measured in hours. * Twitter Usage:Twitter app usage duration measured in hours. * Whatsapp Usage:Whatsapp app usage duration measured in hours. * Youtube Usage:Youtube app usage duration measured in hours. * OTT Usage: Over-the-Top(OTT) media services usage duration in hours. * Reddit Usage:Reddit app usage duration measured in hours. * Trouble Falling Asleep: Indicates whether the student reported having trouble falling asleep (0: No, 1: Yes). * Productivity: student’s mood and productivity level(0: Bad, 1: Good) * Tiredness upon Waking Up in the Morning: Indicates the level of tiredness the student reported upon waking up in the morning(0: Low, 1: High). * Weekenergy: Indicates the level of energy the student felt entire week measured on a scale of 5.(where 5 :High and 1:low)
#Objective: to understand the impact of various social media apps' usage on the lifestyle patterns of the students.
# Keep the 12 numeric measurement variables by dropping the ID column
# ("character"); equivalent to selecting columns 2:13.
sm1 <- sm[, setdiff(names(sm), "character")]
sm1
## # A tibble: 21 × 12
## Instagram LinkedIn SnapChat Twitter Whatsapp youtube OTT Reddit
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.5 4 1 5 1 2.5 14.5 2.5
## 2 7.73 5.2 3.68 0 4.18 4.25 0 0
## 3 3.77 7 0.533 0 9.83 1.85 2 0
## 4 5.38 5.32 1.3 0 5.3 2 2 0
## 5 12 0.583 0 0.667 3 3.5 2 1
## 6 2.33 7 0.467 0 12 7 3 0
## 7 5.37 4 0 0 6 3 0 0
## 8 7 4 3 0 10 2 3 0
## 9 8.65 10 3.83 0 6.15 4 3 0
## 10 0.167 0 0 0 1 3 0 0
## # ℹ 11 more rows
## # ℹ 4 more variables: Trouble_falling_asleep <dbl>, productivity <dbl>,
## # Tired_morning <dbl>, weekenergy <dbl>
# Five-number summary plus mean for every variable.
summary(sm1)
## Instagram LinkedIn SnapChat Twitter
## Min. : 0.1667 Min. : 0.000 Min. : 0.0000 Min. :0.0000
## 1st Qu.: 3.7667 1st Qu.: 1.917 1st Qu.: 0.0000 1st Qu.:0.0000
## Median : 5.3833 Median : 3.917 Median : 0.5333 Median :0.0000
## Mean : 5.9230 Mean : 3.624 Mean : 1.9738 Mean :0.5802
## 3rd Qu.: 7.0000 3rd Qu.: 5.000 3rd Qu.: 1.8667 3rd Qu.:0.2667
## Max. :15.0167 Max. :10.000 Max. :14.8667 Max. :5.0000
## Whatsapp youtube OTT Reddit
## Min. : 1.000 Min. :0.000 Min. : 0.000 Min. :0.0000
## 1st Qu.: 3.667 1st Qu.:2.000 1st Qu.: 0.000 1st Qu.:0.0000
## Median : 6.000 Median :3.000 Median : 1.683 Median :0.0000
## Mean : 6.430 Mean :2.973 Mean : 2.361 Mean :0.5243
## 3rd Qu.: 8.917 3rd Qu.:4.000 3rd Qu.: 2.467 3rd Qu.:0.0000
## Max. :15.350 Max. :7.000 Max. :14.500 Max. :7.0000
## Trouble_falling_asleep productivity Tired_morning weekenergy
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :2.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:3.000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :3.000
## Mean :0.3333 Mean :0.9524 Mean :0.3333 Mean :3.381
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :5.000
#The summary statistics help us to understand the spread of the data.
#Calculating mean and covariance
# Column means of all 12 variables (reused below for Mahalanobis distances).
colmean <- colMeans(sm1)
colmean
## Instagram LinkedIn SnapChat
## 5.9230159 3.6239683 1.9738095
## Twitter Whatsapp youtube
## 0.5801587 6.4295238 2.9725397
## OTT Reddit Trouble_falling_asleep
## 2.3607937 0.5242857 0.3333333
## productivity Tired_morning weekenergy
## 0.9523810 0.3333333 3.3809524
# Sample covariance matrix (12 x 12), reused below for Mahalanobis distances.
covariance <- cov(sm1)
covariance
## Instagram LinkedIn SnapChat Twitter
## Instagram 12.3713466 0.84502910 3.53067460 -0.87639550
## LinkedIn 0.8450291 6.12742513 0.21894246 -0.41554511
## SnapChat 3.5306746 0.21894246 12.00701587 -0.80480952
## Twitter -0.8763955 -0.41554511 -0.80480952 1.66576720
## Whatsapp 5.3507421 2.28152698 1.13048413 -2.57754048
## youtube 2.0167692 1.94490608 -0.96373016 -0.42199431
## OTT 3.3204392 1.62114392 1.60984127 2.53849431
## Reddit -0.4179702 -0.02756786 -0.44789881 0.33895595
## Trouble_falling_asleep 0.8586111 -0.14905556 0.79416667 -0.04888889
## productivity 0.2878175 0.18119841 0.09869048 0.02900794
## Tired_morning 0.1169444 -0.39155556 0.31666667 -0.16972222
## weekenergy -0.2242063 0.74891270 0.15464286 -0.11123016
## Whatsapp youtube OTT Reddit
## Instagram 5.35074206 2.016769180 3.3204391534 -0.41797024
## LinkedIn 2.28152698 1.944906085 1.6211439153 -0.02756786
## SnapChat 1.13048413 -0.963730159 1.6098412698 -0.44789881
## Twitter -2.57754048 -0.421994312 2.5384943122 0.33895595
## Whatsapp 16.22280476 2.601174603 1.9001670635 -0.86244286
## youtube 2.60117460 3.018996561 0.9850062169 0.07305357
## OTT 1.90016706 0.985006217 12.4655743386 1.30897143
## Reddit -0.86244286 0.073053571 1.3089714286 2.53638571
## Trouble_falling_asleep -0.12266667 0.032444444 -0.0007777778 -0.10800000
## productivity 0.27147619 -0.001373016 0.1180396825 0.02621429
## Tired_morning -0.08266667 -0.298388889 -0.3774444444 -0.10800000
## weekenergy 0.38585714 0.280484127 -0.3188174603 -0.15921429
## Trouble_falling_asleep productivity Tired_morning
## Instagram 0.8586111111 0.287817460 0.11694444
## LinkedIn -0.1490555556 0.181198413 -0.39155556
## SnapChat 0.7941666667 0.098690476 0.31666667
## Twitter -0.0488888889 0.029007937 -0.16972222
## Whatsapp -0.1226666667 0.271476190 -0.08266667
## youtube 0.0324444444 -0.001373016 -0.29838889
## OTT -0.0007777778 0.118039683 -0.37744444
## Reddit -0.1080000000 0.026214286 -0.10800000
## Trouble_falling_asleep 0.2333333333 0.016666667 0.08333333
## productivity 0.0166666667 0.047619048 0.01666667
## Tired_morning 0.0833333333 0.016666667 0.23333333
## weekenergy 0.0666666667 0.069047619 0.06666667
## weekenergy
## Instagram -0.22420635
## LinkedIn 0.74891270
## SnapChat 0.15464286
## Twitter -0.11123016
## Whatsapp 0.38585714
## youtube 0.28048413
## OTT -0.31881746
## Reddit -0.15921429
## Trouble_falling_asleep 0.06666667
## productivity 0.06904762
## Tired_morning 0.06666667
## weekenergy 0.54761905
#Compute correlation matrix
# Reuse sm1 (identical to sm[, 2:13]) instead of re-subsetting sm, for
# consistency with the rest of the script.
cor_matrix <- cor(sm1)
#Create a heatmap of the correlation matrix
heatmap(cor_matrix, main = "Heatmap of Correlation Matrix")
#From covariance we can understand how two variables are linearly related. A good positive covariance indicates a strong linear relationship between the variables. I could see weekenergy and LinkedIn app usage having a good linear relationship based on covariance. This affirmation was made strong by the heatmap of correlation matrix where the pale color shows low correlation and the dark color shows high correlation among the variables.
#Mahalanobis distances for each observation
# Use stats::mahalanobis() rather than a manual apply() loop: it computes the
# same squared distance t(x - center) %*% solve(cov) %*% (x - center) for each
# row, avoids shadowing `sm1` as the anonymous-function argument (a bug
# waiting to happen), and inverts the covariance matrix only once.
sm_d <- mahalanobis(sm1, center = colmean, cov = covariance)
sm_d
## [1] 17.082159 10.431647 8.097699 9.464283 10.004631 12.785065 6.094650
## [8] 11.009514 13.558265 19.047619 12.678089 9.556442 18.038751 4.702959
## [15] 18.706311 9.931812 10.823361 5.864670 6.454342 13.839615 11.828117
#Mahalanobis distance measures how far each observation lies from the mean vector of the distribution, accounting for the covariance between variables. Observations 1, 10, 13 and 15 lie farthest from the mean, so Mahalanobis distance helps flag possible multivariate outliers.
#starplot
# One star per student; each ray's length encodes one variable's value.
stars(sm1)
#with starplot we can immediately identify the observations with similarities. Here 3,4,6,9,12,14,17,19 users pattern of social media apps usage are a bit similar.
# Scatterplot matrix of all pairwise variable combinations.
pairs(sm1)
#Boxplot of the apps
# Columns 2:9 of sm are the eight app-usage variables (hours).
boxplot(sm[,2:9])
#Boxplot helps us to understand the central tendendency, spread, range and outliers in the dataset.I could see instagram and whatsapp are very predominant among all the apps.
#Visualization of Instagram and whatsapp impact on entire weeks energy
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
# Density of Instagram usage, one semi-transparent curve per weekly-energy level.
p_insta_energy <- ggplot(sm1, aes(x = Instagram, fill = factor(weekenergy))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Instagram usage and weekenergy",
       x = "Instagram Usage",
       y = "Density",
       fill = "entire weeks energy")
p_insta_energy
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
# Same visualization for Whatsapp usage versus weekly energy.
p_wa_energy <- ggplot(sm1, aes(x = Whatsapp, fill = factor(weekenergy))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Whatsapp usage and weekenergy",
       x = "Whatapp Usage",
       y = "Density",
       fill = "Entire weeks energy")
p_wa_energy
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
#We could see that people who are spending less time on instagram and whatsapp are having more energy the entire week.
#Visualization of Instagram and whatsapp impact on Trouble_falling_asleep
library(ggplot2)
# Refer to the column by its bare name inside aes(): sm1$Trouble_falling_asleep
# bypasses the data argument (ggplot2 warns this usage is discouraged) and can
# break grouping/facetting; the rendered plot is unchanged.
ggplot(sm1, aes(x = Instagram, fill = factor(Trouble_falling_asleep))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Instagram usage and sleep pattern",
       x = "Instagram Usage",
       y = "Density",
       fill = "trouble in falling asleep")
## Warning: Use of `sm1$Trouble_falling_asleep` is discouraged.
## ℹ Use `Trouble_falling_asleep` instead.
library(ggplot2)
# Bare column name instead of sm1$Trouble_falling_asleep (discouraged in aes();
# see the ggplot2 warning in the original run). Output is identical.
ggplot(sm1, aes(x = Whatsapp, fill = factor(Trouble_falling_asleep))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Whatsapp usage and sleep pattern",
       x = "Whatsapp Usage",
       y = "Density",
       fill = "trouble in falling asleep")
## Warning: Use of `sm1$Trouble_falling_asleep` is discouraged.
## ℹ Use `Trouble_falling_asleep` instead.
#There is a high impact of whatsapp and instagram usage on sleep pattern of the students.
#Visualization of Instagram and whatsapp impact on Tired mornings
library(ggplot2)
# Bare column name instead of sm1$Tired_morning (discouraged in aes(); see the
# ggplot2 warning in the original run). Output is identical.
ggplot(sm1, aes(x = Instagram, fill = factor(Tired_morning))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Instagram usage and morning tiredness",
       x = "Instagram Usage",
       y = "Density",
       fill = "Tired Morning")
## Warning: Use of `sm1$Tired_morning` is discouraged.
## ℹ Use `Tired_morning` instead.
library(ggplot2)
# Bare column name instead of sm1$Tired_morning (discouraged in aes(); see the
# ggplot2 warning in the original run). Output is identical.
ggplot(sm1, aes(x = Whatsapp, fill = factor(Tired_morning))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Whatsapp usage and morning tiredness",
       x = "Whatsapp Usage",
       y = "Density",
       fill = "Tired Morning")
## Warning: Use of `sm1$Tired_morning` is discouraged.
## ℹ Use `Tired_morning` instead.
#Instagram and whatsapp usage having a direct impact on tired mornings.
##Visualization of Instagram and whatsapp impact on productivity.
library(ggplot2)
# Bare column name instead of sm1$productivity (discouraged in aes(); see the
# ggplot2 warning in the original run). Output is identical.
ggplot(sm1, aes(x = Instagram, fill = factor(productivity))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Instagram usage and productivity",
       x = "Instagram Usage",
       y = "Density",
       fill = "productivity")
## Warning: Use of `sm1$productivity` is discouraged.
## ℹ Use `productivity` instead.
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
library(ggplot2)
# Bare column name instead of sm1$productivity (discouraged in aes(); see the
# ggplot2 warning in the original run). Output is identical.
ggplot(sm1, aes(x = Whatsapp, fill = factor(productivity))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Whatsapp usage and productivity",
       x = "Whatsapp Usage",
       y = "Density",
       fill = "productivity")
## Warning: Use of `sm1$productivity` is discouraged.
## ℹ Use `productivity` instead.
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
#Interestingly, stundents are productive even with high usage of instagram and whatsapp. This tells me a fact that instagram and whatsapp were used by students for good purposes also like learning new trends in technology and social world.
# PCA on the eight app-usage columns only, standardised (scale=TRUE) so that
# apps with larger hour ranges do not dominate the components.
sm_pca <- prcomp(sm1[1:8],scale=TRUE)
sm_pca
## Standard deviations (1, .., p=8):
## [1] 1.4936735 1.3063036 1.1303414 0.9384308 0.9007674 0.7694819 0.6078953
## [8] 0.3621675
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5
## Instagram 0.43537868 -0.15356106 0.377944297 0.03258545 -0.36643468
## LinkedIn 0.35649468 -0.20991651 -0.329259391 -0.19907722 0.70255046
## SnapChat 0.15947567 0.03392372 0.717791182 0.10281712 0.51019404
## Twitter -0.39046510 -0.53815939 0.040482193 -0.33406772 -0.09484716
## Whatsapp 0.52694693 0.06079220 0.007167203 0.09132133 -0.28308505
## youtube 0.45370070 -0.20727847 -0.406762139 -0.03615301 -0.09453089
## OTT 0.08162766 -0.68423619 0.216480569 -0.12793328 -0.07632718
## Reddit -0.12361633 -0.35601755 -0.139687974 0.90062217 0.08919336
## PC6 PC7 PC8
## Instagram -0.51184307 -0.48893799 -0.08742222
## LinkedIn 0.02630366 -0.42220705 -0.09096859
## SnapChat -0.04887482 0.39304838 -0.17449187
## Twitter -0.02585534 0.04614556 -0.65794190
## Whatsapp 0.68926770 0.02263817 -0.39306576
## youtube -0.39774620 0.64519354 -0.03187491
## OTT 0.31710521 0.06567287 0.59264873
## Reddit -0.02048963 -0.07045776 -0.11831273
#From the PCA analysis, we can find the association of different apps with each principal components using which we can reduce the dimensionality of the dataset. Higher absolute values (close to 1) suggest a stronger relationship, while values closer to 0 indicate a weaker association
#For example snapchat app has strong association with PC3
# Variance explained per component (std dev, proportion, cumulative).
summary(sm_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion 0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
## PC8
## Standard deviation 0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion 1.0000
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Scree plot with percentage-of-variance labels on each component.
fviz_eig(sm_pca, addlabels = TRUE)
#Proportion of total variance here tells about the variance explained by the principal components.
#From the screeplot it is evident that to cover variance above 90% we need to consider PC1 to PC6. I could interpret that PCA is not that beneficial for the social media dataset because we are allowed to discard only PC7 and PC8
# Variable correlation circle: colour encodes cos2, i.e. how well each
# variable is represented by the displayed components.
fviz_pca_var(sm_pca,col.var = "cos2",
gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"),
repel = TRUE)
#From the above plot we can understand the usage of Twitter and Reddit apps are very similar among all. I could tell students are not using twitter and reddit apps that much.
# load library for factor analysis
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#Parallel analysis
# Compares observed eigenvalues with those from random data to suggest how
# many factors/components to retain (only the eight app-usage columns used).
fa.parallel(sm1[1:8])
## Parallel analysis suggests that the number of factors = 0 and the number of components = 0
# Principal-components factor solution: 2 components, varimax rotation.
fit.pc <- principal(sm1[1:8], nfactors=2, rotate="varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1[1:8], nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## Instagram 0.68 0.01 0.463 0.54 1.0
## LinkedIn 0.59 0.11 0.359 0.64 1.1
## SnapChat 0.22 -0.11 0.059 0.94 1.5
## Twitter -0.36 0.84 0.834 0.17 1.4
## Whatsapp 0.73 -0.30 0.626 0.37 1.3
## youtube 0.73 0.07 0.533 0.47 1.0
## OTT 0.37 0.82 0.814 0.19 1.4
## Reddit -0.04 0.50 0.250 0.75 1.0
##
## RC1 RC2
## SS loadings 2.19 1.75
## Proportion Var 0.27 0.22
## Cumulative Var 0.27 0.49
## Proportion Explained 0.56 0.44
## Cumulative Proportion 0.56 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.13
## with the empirical chi square 21.06 with prob < 0.072
##
## Fit based upon off diagonal values = 0.71
#High absolute values (close to 1) indicate a strong relationship between the variable and the factor. #h2 explains how much variance of the variables are explained by the factors. #u2 indicates the amount of variance not explained by the factors #Reddit,OTT, twitter are better explained by RC2 and all other apps like Instagram,LinkedIn,SnapChat,Whatsapp,youtube are well explained by RC1.
# Eigenvalues of the correlation matrix, rounded to 3 decimals.
round(fit.pc$values, 3)
## [1] 2.231 1.706 1.278 0.881 0.811 0.592 0.370 0.131
# Rotated component loadings (small loadings suppressed in the print method).
fit.pc$loadings
##
## Loadings:
## RC1 RC2
## Instagram 0.681
## LinkedIn 0.589 0.111
## SnapChat 0.216 -0.110
## Twitter -0.359 0.840
## Whatsapp 0.732 -0.300
## youtube 0.727
## OTT 0.371 0.822
## Reddit 0.498
##
## RC1 RC2
## SS loadings 2.189 1.749
## Proportion Var 0.274 0.219
## Cumulative Var 0.274 0.492
# Communalities
# h2: proportion of each variable's variance explained by the two components.
fit.pc$communality
## Instagram LinkedIn SnapChat Twitter Whatsapp youtube OTT
## 0.46314710 0.35873574 0.05870521 0.83436255 0.62581187 0.53256680 0.81378028
## Reddit
## 0.25038016
# Rotated factor scores, Notice the columns ordering: RC1, RC2
# NOTE(review): this reprints the full solution shown earlier; the per-student
# scores themselves are printed via fit.pc$scores below.
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1[1:8], nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## Instagram 0.68 0.01 0.463 0.54 1.0
## LinkedIn 0.59 0.11 0.359 0.64 1.1
## SnapChat 0.22 -0.11 0.059 0.94 1.5
## Twitter -0.36 0.84 0.834 0.17 1.4
## Whatsapp 0.73 -0.30 0.626 0.37 1.3
## youtube 0.73 0.07 0.533 0.47 1.0
## OTT 0.37 0.82 0.814 0.19 1.4
## Reddit -0.04 0.50 0.250 0.75 1.0
##
## RC1 RC2
## SS loadings 2.19 1.75
## Proportion Var 0.27 0.22
## Cumulative Var 0.27 0.49
## Proportion Explained 0.56 0.44
## Cumulative Proportion 0.56 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.13
## with the empirical chi square 21.06 with prob < 0.072
##
## Fit based upon off diagonal values = 0.71
# Per-student scores on the two rotated components.
fit.pc$scores
## RC1 RC2
## [1,] -0.49593354 3.81935345
## [2,] 0.37453109 -0.45810128
## [3,] 0.24035060 -0.39724117
## [4,] -0.11821501 -0.28545411
## [5,] -0.05535776 0.18730345
## [6,] 1.33982873 -0.15394397
## [7,] -0.16544326 -0.57370308
## [8,] 0.36239954 -0.37616101
## [9,] 1.27403174 0.04737682
## [10,] -1.47503111 -0.57670717
## [11,] -1.47090422 0.33142379
## [12,] -0.99564293 0.65382335
## [13,] -0.64587802 -0.84729946
## [14,] -0.54403365 -0.04349169
## [15,] -0.05025441 0.89199339
## [16,] 0.46518890 -0.19297427
## [17,] 0.63875357 -0.56313735
## [18,] -0.42630523 -0.44123138
## [19,] 0.50350291 -0.59787800
## [20,] 2.63240375 0.51258979
## [21,] -1.38799166 -0.93654010
fa.plot(fit.pc) # See Correlations within Factors
#Factors that contribute to RC1 and RC2 Visualization
# Path diagram showing which apps load on RC1 versus RC2.
fa.diagram(fit.pc)
#some visualizations using the factors
#very simple structure visualization
# VSS / MAP criteria across 1..8 factors to judge how many factors to keep.
vss(sm1)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
##
## Very Simple Structure
## Call: vss(x = sm1)
## Although the VSS complexity 1 shows 8 factors, it is probably more reasonable to think about 5 factors
## VSS complexity 2 achieves a maximimum of 0.78 with 6 factors
##
## The Velicer MAP achieves a minimum of 0.07 with 1 factors
## BIC achieves a minimum of -91.91 with 1 factors
## Sample Size adjusted BIC achieves a minimum of 7.85 with 7 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex eChisq
## 1 0.32 0.00 0.065 54 72.5 0.047 13.31 0.32 0.118 -91.9 74.8 1.0 96.12
## 2 0.42 0.55 0.073 43 57.5 0.068 8.87 0.55 0.117 -73.4 59.4 1.4 54.81
## 3 0.51 0.66 0.083 33 40.8 0.165 5.81 0.70 0.094 -59.7 42.2 1.5 26.88
## 4 0.54 0.72 0.096 24 27.9 0.264 3.87 0.80 0.073 -45.2 28.9 1.6 11.30
## 5 0.55 0.74 0.119 16 19.5 0.242 2.82 0.86 0.090 -29.2 20.2 1.7 5.24
## 6 0.54 0.78 0.162 9 14.5 0.104 2.07 0.89 0.164 -12.9 14.9 1.8 2.95
## 7 0.55 0.72 0.194 3 7.7 0.052 1.81 0.91 0.269 -1.4 7.8 2.0 1.17
## 8 0.59 0.76 0.265 -2 4.9 NA 0.92 0.95 NA NA NA 1.9 0.36
## SRMR eCRMS eBIC
## 1 0.186 0.206 -68
## 2 0.141 0.174 -76
## 3 0.098 0.139 -74
## 4 0.064 0.106 -62
## 5 0.043 0.088 -43
## 6 0.033 0.088 -24
## 7 0.021 0.096 -8
## 8 0.011 NA NA
# Computing Correlation Matrix
# Correlations among all 12 variables (apps plus lifestyle outcomes).
corrm.sm <- cor(sm1)
corrm.sm
## Instagram LinkedIn SnapChat Twitter
## Instagram 1.00000000 0.097056399 0.28968877 -0.19305653
## LinkedIn 0.09705640 1.000000000 0.02552545 -0.13006846
## SnapChat 0.28968877 0.025525452 1.00000000 -0.17995686
## Twitter -0.19305653 -0.130068464 -0.17995686 1.00000000
## Whatsapp 0.37769615 0.228835623 0.08099980 -0.49583293
## youtube 0.33000187 0.452197669 -0.16006877 -0.18817769
## OTT 0.26738122 0.185492527 0.13158590 0.55707401
## Reddit -0.07461553 -0.006992884 -0.08116237 0.16490296
## Trouble_falling_asleep 0.50535856 -0.124658176 0.47446668 -0.07841779
## productivity 0.37498880 0.335447863 0.13051708 0.10299576
## Tired_morning 0.06883078 -0.327465831 0.18918923 -0.27223450
## weekenergy -0.08613906 0.408839562 0.06030774 -0.11645982
## Whatsapp youtube OTT Reddit
## Instagram 0.37769615 0.330001869 0.2673812155 -0.074615529
## LinkedIn 0.22883562 0.452197669 0.1854925268 -0.006992884
## SnapChat 0.08099980 -0.160068767 0.1315859038 -0.081162369
## Twitter -0.49583293 -0.188177691 0.5570740080 0.164902964
## Whatsapp 1.00000000 0.371685163 0.1336203696 -0.134449660
## youtube 0.37168516 1.000000000 0.1605652338 0.026399913
## OTT 0.13362037 0.160565234 1.0000000000 0.232791099
## Reddit -0.13444966 0.026399913 0.2327910994 1.000000000
## Trouble_falling_asleep -0.06304856 0.038656332 -0.0004560485 -0.140387265
## productivity 0.30887192 -0.003621212 0.1532080319 0.075429291
## Tired_morning -0.04248925 -0.355519106 -0.2213138016 -0.140387265
## weekenergy 0.12945663 0.218141205 -0.1220243994 -0.135093704
## Trouble_falling_asleep productivity Tired_morning
## Instagram 0.5053585585 0.374988801 0.06883078
## LinkedIn -0.1246581760 0.335447863 -0.32746583
## SnapChat 0.4744666806 0.130517080 0.18918923
## Twitter -0.0784177942 0.102995756 -0.27223450
## Whatsapp -0.0630485641 0.308871917 -0.04248925
## youtube 0.0386563318 -0.003621212 -0.35551911
## OTT -0.0004560485 0.153208032 -0.22131380
## Reddit -0.1403872651 0.075429291 -0.14038727
## Trouble_falling_asleep 1.0000000000 0.158113883 0.35714286
## productivity 0.1581138830 1.000000000 0.15811388
## Tired_morning 0.3571428571 0.158113883 1.00000000
## weekenergy 0.1865009616 0.427581673 0.18650096
## weekenergy
## Instagram -0.08613906
## LinkedIn 0.40883956
## SnapChat 0.06030774
## Twitter -0.11645982
## Whatsapp 0.12945663
## youtube 0.21814120
## OTT -0.12202440
## Reddit -0.13509370
## Trouble_falling_asleep 0.18650096
## productivity 0.42758167
## Tired_morning 0.18650096
## weekenergy 1.00000000
# NOTE(review): plot() on a plain matrix scatter-plots its first two columns
# only, which is probably not the intended view of a correlation matrix — the
# earlier heatmap() (or corrplot::corrplot) would be clearer; confirm intent.
plot(corrm.sm)
# social_pca repeated exactly the PCA already stored in sm_pca above (same
# data sm1[1:8], same scaling), so reuse that object instead of recomputing.
social_pca <- sm_pca
summary(social_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion 0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
## PC8
## Standard deviation 0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion 1.0000
# Scree plot (variances of the components) for the PCA.
plot(social_pca)
#Biplot Visualization
# Biplot of the two rotated components from the psych::principal fit.
biplot(fit.pc)
#I feel factor analysis is not beneficial for the social media data because I observed that we are missing the most part of the uniqueness of these apps by including factors and we are able to capture only a small portion of variances by using factors. #And parallel analysis screeplot indicated that the ideal number of factors for the social media data is zero. #From the component analysis we got similar results to PCA, where the apps like Instagram, whatsapp/wechat, LinkedIn, Youtube, Snapchat usages are a bit similar and high compared to OTT, Twitter and Reddit.
#Hierarchical Clustering- Dendrogram
# Standardise all 12 variables, then compute Euclidean distances between students.
sm_scaled <- scale(sm1)
dist_matrix <- dist(sm_scaled)
#Clustering Single
# Single linkage: merges clusters by their closest pair of points.
hc <- hclust(dist_matrix,method = "single")
fviz_dend(hc)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Default Clustering
# hclust() defaults to complete linkage.
hc <- hclust(dist_matrix)
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")
#Average Clustering
hc <- hclust(dist_matrix,method = "average")
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")
#By observing the above dendrogram’s k=2 clusters will be sufficient to group the entire students of the class.This is confirmed further with D index graphical representation.
#Non-Hierarchical Clustering(k-means)
num_clusters <- 2
# Fix the RNG seed so the k-means initialisation (and hence cluster labels)
# is reproducible across runs, and use several random starts to avoid a poor
# local optimum (the default nstart = 1 can vary between runs). Cluster
# labels/sizes may therefore differ from the original unseeded transcript.
set.seed(123)
kmeans_model <- kmeans(sm_scaled, centers = num_clusters, nstart = 25)
# Membership for each cluster
# Cluster sizes: number of students assigned to each cluster.
table(kmeans_model$cluster)
##
## 1 2
## 7 14
# Principal Components
# NOTE(review): sm_scaled is already standardised, so scale=TRUE here is
# redundant (re-scaling standardised columns leaves them unchanged) —
# harmless but misleading.
pca_result <- prcomp(sm_scaled,scale=TRUE)
pca_result
## Standard deviations (1, .., p=12):
## [1] 1.6203942 1.4754300 1.3310928 1.1776881 0.9897347 0.9245638 0.8967807
## [8] 0.7514383 0.6440153 0.4723495 0.4045767 0.1839881
##
## Rotation (n x k) = (12 x 12):
## PC1 PC2 PC3 PC4
## Instagram 0.42816828 0.040374089 -0.27669154 0.36077066
## LinkedIn 0.31458993 -0.375066753 0.14658080 -0.25512070
## SnapChat 0.24530326 0.279812642 -0.28852512 0.08458621
## Twitter -0.27287007 -0.245837113 -0.49833107 -0.25725953
## Whatsapp 0.40478489 -0.117263785 0.20574594 0.25457749
## youtube 0.30854324 -0.364491106 0.17997327 0.21460358
## OTT 0.07977957 -0.348086166 -0.52220162 0.08187832
## Reddit -0.11757332 -0.226467324 -0.22416407 -0.02858350
## Trouble_falling_asleep 0.28246439 0.358670833 -0.31587576 0.04233834
## productivity 0.35825558 -0.053909034 -0.20839294 -0.43253653
## Tired_morning 0.08137741 0.517992460 -0.03369194 -0.19614866
## weekenergy 0.29846249 0.006382348 0.16843149 -0.62103181
## PC5 PC6 PC7 PC8
## Instagram -0.04578095 0.05108928 0.28555148 -0.325087718
## LinkedIn 0.19120246 -0.10250708 -0.30832764 -0.195679386
## SnapChat 0.15885641 -0.16531309 -0.72105808 0.051423328
## Twitter 0.18303488 0.25159733 0.16826747 0.008552916
## Whatsapp -0.41429227 0.29631693 -0.11055036 0.277673487
## youtube 0.28970917 -0.24318967 0.35454996 0.237554134
## OTT -0.02365440 0.20285346 -0.10917199 0.513644260
## Reddit -0.49365028 -0.77780629 0.01903676 0.039910022
## Trouble_falling_asleep 0.36779356 -0.24340561 0.25673461 -0.033294256
## productivity -0.39794597 0.18254386 0.08750531 -0.416952523
## Tired_morning -0.27688783 0.01151556 0.21367750 0.428408387
## weekenergy 0.18024711 -0.10234403 0.07562961 0.309613656
## PC9 PC10 PC11 PC12
## Instagram -0.17782279 0.15803981 -0.54472544 0.25115168
## LinkedIn -0.58222718 -0.27931005 0.12757002 0.24512539
## SnapChat 0.16360056 0.38914908 0.08535895 0.10251068
## Twitter 0.15950829 0.11258940 0.19450385 0.59114180
## Whatsapp 0.34392387 -0.29798515 0.17846889 0.35715770
## youtube 0.03795324 0.48373698 0.35681581 -0.07438099
## OTT -0.20642806 -0.14358794 -0.15991649 -0.43221675
## Reddit 0.07648933 -0.08051441 -0.04455224 0.13738996
## Trouble_falling_asleep 0.15983503 -0.56114478 0.28533231 -0.08132826
## productivity 0.11824824 0.19017568 0.29333104 -0.35895531
## Tired_morning -0.52328057 0.17804741 0.18472916 0.19953066
## weekenergy 0.30926178 0.01575160 -0.50356016 0.05312393
# Visualize cluster and membership using first two Principal Components
# Plots students in PC1/PC2 space, coloured by their k-means cluster.
fviz_cluster(list(data = pca_result$x[, 1:2], cluster = kmeans_model$cluster))
#This plot visualizes clusters and their memberships using the first two principal components.
# Visualize cluster centers for k-means
# ellipse.type replaces the deprecated frame/frame.type arguments that
# factoextra warned about in the original run; the rendered plot is the same.
fviz_cluster(kmeans_model, data = sm_scaled, geom = "point",
             ellipse.type = "convex", pointsize = 2, fill = "white",
             main = "K-means Cluster Centers")
## Warning: argument frame is deprecated; please use ellipse instead.
## Warning: argument frame.type is deprecated; please use ellipse.type instead.
# Visualize cluster and membership using first two Principal Components for k-means
# NOTE(review): this recomputes exactly the same PCA as pca_result above; the
# existing object could be reused instead of calling prcomp() again.
pca_result <- prcomp(sm_scaled, scale = TRUE)
fviz_cluster(kmeans_model, data = pca_result$x[, 1:2], geom = "point",
pointsize = 2, fill = "white", main = "K-means Clustering Result (PCA)")
#This visualization helps to understand how the data points are grouped into clusters based on their similarities, as revealed by the PCA analysis.
library(cluster)
# Calculate silhouette information for k-means clustering
# Silhouette width lies in [-1, 1]: values near 1 mean a point sits well
# inside its own cluster; negative values suggest possible misassignment.
sil <- silhouette(kmeans_model$cluster, dist(sm_scaled))
# Visualize the silhouette plot for k-means clustering
fviz_silhouette(sil, main = "Silhouette Plot for K-means Clustering")
## cluster size ave.sil.width
## 1 1 7 0.00
## 2 2 14 0.19
#A higher silhouette width indicates better separation of clusters, while negative values suggest that points might be assigned to the wrong clusters. This plot helps in determining the optimal number of clusters for k-means clustering and assessing the overall clustering performance.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(NbClust)
#optimal cluster method/visualization
# sm_scaled is already standardised, so the original extra `%>% scale()` step
# was a no-op; call NbClust on the eight app-usage columns directly. Results
# are identical.
res.nbclust <- NbClust(sm_scaled[, 1:8], distance = "euclidean",
                       min.nc = 2, max.nc = 10, method = "complete",
                       index = "all")
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 4 proposed 8 as the best number of clusters
## * 4 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
#The Dindex suggests the optimal number of clusters according to majority rule is 2. #Through cluster analysis I am able to figure out users whose social media usage pattern is similar to mine. cluster analysis helped to group students based on hidden patterns of their social media usage based on which any further analysis can be done.
#Multiple Regression
#Performing multiple regression on the dataset
# Fix: with `data = sm1` supplied, the response should be referenced by bare
# column name — the `sm1$` prefix was redundant and error-prone (it bypasses
# the data argument). The fitted model is unchanged.
fit <- lm(Trouble_falling_asleep ~ Instagram + LinkedIn + SnapChat + Twitter +
            Whatsapp + youtube + OTT + Reddit, data = sm1)
#show the results
summary(fit)
##
## Call:
## lm(formula = sm1$Trouble_falling_asleep ~ Instagram + LinkedIn +
## SnapChat + Twitter + Whatsapp + youtube + OTT + Reddit, data = sm1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47240 -0.26171 -0.04857 0.14431 0.71319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.02935 0.36730 0.080 0.938
## Instagram 0.07008 0.03479 2.014 0.067 .
## LinkedIn -0.02846 0.04682 -0.608 0.555
## SnapChat 0.05725 0.03387 1.690 0.117
## Twitter 0.03816 0.14738 0.259 0.800
## Whatsapp -0.02922 0.03663 -0.798 0.441
## youtube 0.04025 0.07350 0.548 0.594
## OTT -0.02642 0.05020 -0.526 0.608
## Reddit -0.02379 0.06667 -0.357 0.727
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4432 on 12 degrees of freedom
## Multiple R-squared: 0.495, Adjusted R-squared: 0.1583
## F-statistic: 1.47 on 8 and 12 DF, p-value: 0.2639
#From the above summary, the overall F-test p-value is 0.2639, which indicates that the model as a whole is not statistically significant. The model explains approximately 49% of the variability in "Trouble_falling_asleep", as indicated by the multiple R-squared value, but none of the individual coefficients are statistically significant at the 0.05 level, indicating weak evidence of association.
coefficients(fit)
## (Intercept) Instagram LinkedIn SnapChat Twitter Whatsapp
## 0.02935003 0.07007642 -0.02846038 0.05724962 0.03816260 -0.02922242
## youtube OTT Reddit
## 0.04025224 -0.02641988 -0.02379311
#From the above we get information about the dependent variable in equation form y=b0+ b1x1 + b2x2+…+bnxn where intercept b0=0.029, and cofficients b1=0.0700,….
#The positive coefficients for Instagram, Snapchat, Twitter and YouTube suggest a potential positive association with Trouble_falling_asleep, while the negative coefficients for LinkedIn, Whatsapp, OTT and Reddit imply a negative association. The intercept represents the estimated trouble-in-sleep score when all predictors are zero.
fitted(fit)
## 1 2 3 4 5 6
## 0.03767577 0.68297470 -0.14111191 0.20249080 0.85568960 -0.12780750
## 7 8 9 10 11 12
## 0.23700749 0.28681294 0.47239558 0.13256374 0.41692826 0.27284963
## 13 14 15 16 17 18
## 1.09915634 0.43420708 0.04857065 0.41826973 0.41113911 0.32212351
## 19 20 21
## 0.26171236 0.88332910 -0.20697699
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise scatterplots, densities and correlations for all sm1 variables
ggpairs(data=sm1, title="Social-Media")
plot(fit, which=1) # Residuals vs Fitted
plot(fit, which=2) # Normal Q-Q plot
#In an ideal normal distribution QQ plot, the points would fall along a straight diagonal line. However, in this plot, the points show some deviation from the diagonal,this suggests the data may not fully conform to a normal distribution and could indicate the presence of outliers or other non-normal characteristics.Identifying departures from normality can inform the choice of appropriate modeling techniques.
# Raw residuals (observed - fitted) from the multiple regression.
# NOTE(review): this name shadows the base function `residuals`; a name such as
# `resid_fit` would be safer, but it is kept because later chunks reference it.
residuals <- residuals(fit)
residuals
## 1 2 3 4 5 6
## -0.03767577 0.31702530 0.14111191 -0.20249080 0.14431040 0.12780750
## 7 8 9 10 11 12
## -0.23700749 0.71318706 -0.47239558 -0.13256374 0.58307174 -0.27284963
## 13 14 15 16 17 18
## -0.09915634 -0.43420708 -0.04857065 0.58173027 -0.41113911 -0.32212351
## 19 20 21
## -0.26171236 0.11667090 0.20697699
#Plot residuals against fitted values to check for homoscedasticity
resid_plot <- ggplot() +
  geom_point(aes(x = fitted(fit), y = residuals)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    x = "Fitted Values",
    y = "Residuals",
    title = "Residuals vs Fitted Values Plot"
  ) +
  theme_minimal()
print(resid_plot)
#The residual vs. fitted plot is a tool used to evaluate the assumptions
#and adequacy of a regression model. It helps to identify whether the
#model adequately captures the underlying relationships in the data or if
#there are issues that need to be addressed.
#The plot shows a pattern between the fitted values and the residuals
#around zero, suggesting the model is likely not appropriate.
# Predict the sleep-trouble score for one hypothetical student's usage (hours)
predict.lm(fit, data.frame(Instagram=8, LinkedIn=5, SnapChat=4, Twitter=4,
Whatsapp=4, youtube=8, OTT=3, Reddit=4 ))
## 1
## 0.8600045
#Here the model predicted the trouble falling asleep value for the given values.
#Make predictions using the model
predicted <- predict(fit, newdata = sm1)
# Root-mean-squared error: square root of the mean squared gap between the
# observed Trouble_falling_asleep values and the model's predictions
squared_errors <- (sm1$Trouble_falling_asleep - predicted)^2
rmse <- sqrt(mean(squared_errors))
rmse
## [1] 0.3350081
#Low RMSE(0.335) between 0 and 1 indicates that the models predictions are quite accurate, with small deviations from the actual values.In this case, an RMSE value of 0.335 indicates that, on average, the model’s predictions deviate from the observed values by approximately 0.335 units. A lower RMSE value indicates better performance of the model.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:psych':
##
## logit
#Nonlinearity
# component + residual plot
crPlots(fit)
# plot studentized residuals vs. fitted values
# Fix: removed the duplicate library(car) call — car is already attached above.
spreadLevelPlot(fit)
## Warning in spreadLevelPlot.lm(fit):
## 3 negative fitted values removed
##
## Suggested power transformation: 0.7663289
#The plot reveals patterns in the spread of residuals across the range of fitted values. If residuals are evenly spread, it suggests homoscedasticity. The upward trend of the curve suggests increasing variability of residuals as fitted values rise, indicating potential heteroscedasticity.
#Logistic Regression
#Logistic regression is a statistical method used for binary classification problems. It predicts the probability that a given observation belongs to one of two classes. #It gives the relationship between a binary dependent variable and independent variables.
#Load required Libraries
library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Logistic regression: binary sleep-trouble indicator on app-usage hours.
# Fix: with `data = sm1` supplied, reference the response by bare column name —
# the `sm1$` prefix was redundant (the fitted model is unchanged).
lr <- glm(Trouble_falling_asleep ~ Instagram + LinkedIn + SnapChat + Twitter +
            Whatsapp + youtube + OTT + Reddit, data = sm1, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(lr)
##
## Call:
## glm(formula = sm1$Trouble_falling_asleep ~ Instagram + LinkedIn +
## SnapChat + Twitter + Whatsapp + youtube + OTT + Reddit, family = "binomial",
## data = sm1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.6978 4.7433 -0.990 0.322
## Instagram 0.7655 0.5993 1.277 0.202
## LinkedIn -1.0183 1.1699 -0.870 0.384
## SnapChat 2.7338 3.1258 0.875 0.382
## Twitter 0.8471 1.4520 0.583 0.560
## Whatsapp -0.3141 0.3923 -0.801 0.423
## youtube 1.0164 1.5883 0.640 0.522
## OTT -1.5516 2.2342 -0.694 0.487
## Reddit 0.0207 0.6622 0.031 0.975
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26.734 on 20 degrees of freedom
## Residual deviance: 11.500 on 12 degrees of freedom
## AIC: 29.5
##
## Number of Fisher Scoring iterations: 9
#The logistic regression suggests that there is no significant predictor of trouble falling asleep. The intercept indicates a baseline of approximately -4.69 when all independent variables are zero.The model’s fit is modest, with slightly lower residual deviance compared to null deviance, and an AIC of 29.5. #From the above it is evident that the model is not acceptable since the independent variables are not contributing significantly to the variation in the dependent variable.
# Deviance residuals of the logistic model (values near 0 = well-fitted points)
residuals(lr)
## 1 2 3 4 5
## -8.920605e-05 1.569379e-02 -3.882687e-03 -1.063736e-01 1.913654e-01
## 6 7 8 9 10
## -9.188637e-03 -2.441611e-01 1.608947e+00 -1.052751e+00 -5.442003e-01
## 11 12 13 14 15
## 9.633205e-01 -4.333049e-01 2.107342e-08 -1.323641e+00 -3.486828e-01
## 16 17 18 19 20
## 1.765154e+00 -7.920632e-01 -6.237626e-01 -5.244625e-01 3.022679e-02
## 21
## -3.160719e-02
# Residuals vs fitted plot for the glm fit
plot(lr, which = 1)
#There is a fixed pattern in the residuals vs fitted plot which means that the selected independent variables will not explain the dependent variable well.
# Sequential (Type I) analysis-of-deviance table.
# NOTE(review): anova(lr, test = "Chisq") would add per-term p-values — confirm
# whether the plain deviance table is the intended output.
anova(lr)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: sm1$Trouble_falling_asleep
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 20 26.734
## Instagram 1 5.9366 19 20.797
## LinkedIn 1 0.5658 18 20.231
## SnapChat 1 3.6014 17 16.630
## Twitter 1 0.3049 16 16.325
## Whatsapp 1 2.9495 15 13.375
## youtube 1 0.3787 14 12.997
## OTT 1 1.4962 13 11.501
## Reddit 1 0.0009 12 11.500
#"Pseudo R-squared" and its p-value
# McFadden's pseudo R^2: proportional reduction in deviance relative to the
# intercept-only (null) model. Log-likelihoods are deviance divided by -2.
ll.null <- -lr$null.deviance / 2
ll.proposed <- -lr$deviance / 2
pseudo_r2 <- (ll.null - ll.proposed) / ll.null
pseudo_r2
## [1] 0.5698455
#The pseudo R-squared value of approximately 0.57 indicates that the proposed model is a clear improvement over the null model: the predictors explain a substantial portion of the variability in the response, though far from all of it, so the fit is good but not perfect.
# Rank students by fitted probability of sleep trouble and plot probability
# against rank, coloured by the observed outcome
predicted.data <- data.frame(
  probability.of.hd = lr$fitted.values,
  Trouble_falling_asleep = sm1$Trouble_falling_asleep
)
predicted.data <- predicted.data[order(predicted.data$probability.of.hd), ]
predicted.data$rank <- seq_len(nrow(predicted.data))
ggplot(predicted.data, aes(x = rank, y = probability.of.hd)) +
  geom_point(aes(color = Trouble_falling_asleep), alpha = 1, shape = 4, stroke = 2) +
  xlab("Index") +
  ylab("Predicted probability of getting in sleeping")
#The plot is a graphical representation of a predictive model, depicting the probability of trouble falling asleep against an index.The dotted line likely represents a fitted curve, showing how the probability changes across different index values, with a steep incline around the threshold(15). This visualization could be used to understand factors influencing trouble in sleep.
library(caret)
## Loading required package: lattice
# Fitted probabilities of trouble falling asleep for every student in sm1
pdata <- predict(lr,newdata=sm1,type="response" )
pdata
## 1 2 3 4 5 6
## 3.978860e-09 9.998769e-01 7.537601e-06 5.641701e-03 9.818563e-01 4.221463e-05
## 7 8 9 10 11 12
## 2.936746e-02 2.740746e-01 4.254351e-01 1.376353e-01 6.287679e-01 8.960485e-02
## 13 14 15 16 17 18
## 1.000000e+00 5.835601e-01 5.897903e-02 2.105810e-01 2.692487e-01 1.767867e-01
## 19 20 21
## 1.284922e-01 9.995433e-01 4.993826e-04
# Convert fitted probabilities into hard class labels at the 0.5 threshold.
# Fix: the original double negation (as.numeric(pdata > 0.5) == 0 -> "0") is
# simplified to a direct ifelse; the resulting factor is identical.
pdataF <- factor(ifelse(pdata > 0.5, "1", "0"), levels = c("0", "1"))
sm1$Trouble_falling_asleep <- factor(sm1$Trouble_falling_asleep, levels = c("0", "1"))
# Confusion matrix comparing thresholded predictions with observed labels
confusionMatrix(pdataF, sm1$Trouble_falling_asleep)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 13 2
## 1 1 5
##
## Accuracy : 0.8571
## 95% CI : (0.6366, 0.9695)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.04616
##
## Kappa : 0.6667
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.9286
## Specificity : 0.7143
## Pos Pred Value : 0.8667
## Neg Pred Value : 0.8333
## Prevalence : 0.6667
## Detection Rate : 0.6190
## Detection Prevalence : 0.7143
## Balanced Accuracy : 0.8214
##
## 'Positive' Class : 0
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc(sm1$Trouble_falling_asleep, lr$fitted.values, plot=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = sm1$Trouble_falling_asleep, predictor = lr$fitted.values, plot = TRUE)
##
## Data: lr$fitted.values in 14 controls (sm1$Trouble_falling_asleep 0) < 7 cases (sm1$Trouble_falling_asleep 1).
## Area under the curve: 0.949
#Discriminant Analysis
# Splitting the dataset into 75% training and 25% test sets.
# Fix: the split indices were computed from sm1 but then used to subset `sm`
# (a different data frame) — index sm1 consistently so train/test come from the
# same modeling data used everywhere else. A seed makes the random split (and
# everything downstream) reproducible.
set.seed(123)
smp_size_raw <- floor(0.75 * nrow(sm1))
train_ind_raw <- sample(nrow(sm1), size = smp_size_raw)
train_raw.df <- sm1[train_ind_raw, ]
test_raw.df <- sm1[-train_ind_raw, ]
# Fix: with `data = train_raw.df` supplied, reference the response by bare name.
lda_model <- lda(Trouble_falling_asleep ~ Instagram + LinkedIn + SnapChat + Twitter +
                   Whatsapp + youtube + OTT + Reddit, data = train_raw.df)
lda_model
## Call:
## lda(train_raw.df$Trouble_falling_asleep ~ Instagram + LinkedIn +
## SnapChat + Twitter + Whatsapp + youtube + OTT + Reddit, data = train_raw.df)
##
## Prior probabilities of groups:
## 0 1
## 0.5333333 0.4666667
##
## Group means:
## Instagram LinkedIn SnapChat Twitter Whatsapp youtube OTT Reddit
## 0 4.15000 4.520833 1.181250 0.9791667 5.887500 3.097917 3.547917 1.1875000
## 1 8.37619 3.198095 4.242857 0.4404762 6.079048 3.065238 2.358571 0.2157143
##
## Coefficients of linear discriminants:
## LD1
## Instagram 0.344391244
## LinkedIn -0.185506018
## SnapChat 0.166981292
## Twitter 0.480715943
## Whatsapp -0.002749699
## youtube 0.228992485
## OTT -0.247118598
## Reddit -0.126181364
#Prior probability shows the class distribution of each class in the training data.The data is classified into 2 groups based on the peoples trouble in sleep and LD1 having few significant coefficients.Among the groups 0 and 1 people facing trouble in falling asleep is the most predominant group. #Coefficients of Linear Discriminants: The coefficients of linear discriminants represent the weights assigned to each predictor variable in the discriminant function.
# Structural summary of the lda object (component names, lengths and modes)
summary(lda_model)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 16 -none- numeric
## scaling 8 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
##No obvious issues with class imbalance, the LDA model is likely acceptable.
# Histograms of the LD1 discriminant scores, one panel per group
plot(lda_model)
###Residual Analysis
# residuals() has no method for lda objects, so this returns NULL
residuals(lda_model)
## NULL
#LDA does not inherently produce residuals
###Prediction
# Predicted classes, posterior probabilities and LD1 scores for the test split
prediction <- predict(lda_model, test_raw.df)
prediction
## $class
## [1] 0 0 1 1 1 0
## Levels: 0 1
##
## $posterior
## 0 1
## 1 0.9349435 0.065056536
## 2 0.7132359 0.286764137
## 3 0.3242192 0.675780806
## 4 0.0251472 0.974852804
## 5 0.3744611 0.625538931
## 6 0.9940401 0.005959924
##
## $x
## LD1
## 1 -0.9932902
## 2 -0.2505513
## 3 0.4462550
## 4 1.6839962
## 5 0.3525388
## 6 -2.0313388
###Accuracy
# Predict on the test set
predicted_classes <- predict(lda_model, test_raw.df)$class
# Create a confusion matrix to understand misclassifications
confusion_matrix <- table(predicted_classes, test_raw.df$Trouble_falling_asleep)
confusion_matrix
##
## predicted_classes 0
## 0 3
## 1 3
# Accuracy = share of test rows whose predicted class matches the observed label
correct <- predicted_classes == test_raw.df$Trouble_falling_asleep
accuracy <- sum(correct) / length(correct)
accuracy
## [1] 0.5
#Accuracy is moderately acceptable
str(train_raw.df)
## tibble [15 × 13] (S3: tbl_df/tbl/data.frame)
## $ character : chr [1:15] "19!@s" "peace" "vp1234" "ds2134" ...
## $ Instagram : num [1:15] 7 7.733 7 0.167 6.8 ...
## $ LinkedIn : num [1:15] 4 5.2 5 0 1.92 ...
## $ SnapChat : num [1:15] 3 3.683 0.417 0 1.867 ...
## $ Twitter : num [1:15] 0 0 0 0 0 ...
## $ Whatsapp : num [1:15] 10 4.18 5 1 6.95 ...
## $ youtube : num [1:15] 2 4.25 5 3 0.8 2.5 0.54 1.85 3.5 7 ...
## $ OTT : num [1:15] 3 0 1 0 2.47 ...
## $ Reddit : num [1:15] 0 0 0.5 0 0 2.5 0.01 0 1 0 ...
## $ Trouble_falling_asleep: num [1:15] 1 1 1 0 0 0 1 0 1 0 ...
## $ productivity : num [1:15] 1 1 1 0 1 1 1 1 1 1 ...
## $ Tired_morning : num [1:15] 1 0 1 0 1 0 1 0 1 0 ...
## $ weekenergy : num [1:15] 3 3 5 2 3 3 4 4 3 5 ...
library(klaR)
# Fix: removed attach(train_raw.df) — partimat() already receives the data via
# `data =`, and attach() needlessly pollutes the search path and can mask names.
train_raw.df$Trouble_falling_asleep <- factor(train_raw.df$Trouble_falling_asleep)
# Partition plots of the LDA decision regions for each pair of predictors.
# NOTE(review): Whatsapp is omitted from this formula — confirm whether that is
# intentional.
partimat(Trouble_falling_asleep ~ Instagram + LinkedIn + SnapChat + Twitter +
           youtube + OTT + Reddit, data = train_raw.df, method = "lda")
I can conclude that by using MVA models I am able to compute the effect of each independent variable on the dependent variables. This really helped me to understand the impact of various social media apps' usage on the healthy/unhealthy lifestyle of the students, which is the main goal of conducting this analysis. From exploratory data analysis it is clearly evident that Instagram and Whatsapp are the two predominant apps among students, and the hours spent on them have a direct relationship with sleep deprivation, diminished productivity, poor energy and tiredness. The exploratory data analysis also showed that many students use social media apps for good reasons. Either way, having control over the usage of social media apps is beneficial for both the physical and mental health of the students.